Heirarchy

Preliminary steps

Load libraries

Load files

Obtain the heirarchy

Preprocess the data

Transform the data to generate a dataframe of 40 variables for each technique

# Transform the data
wide_data <- meanKLC %>%
  unite("noise_percentage", noise, percentage, sep = "_") %>%
  spread(key = noise_percentage, value = kappa_loss)

# View the transformed data
print(wide_data)
## # A tibble: 20 × 133
##    technique   `0_0` `0_10` `0_100` `0_20` `0_30` `0_40` `0_50` `0_60` `0_70`
##    <chr>       <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 bayesglm        0      0       0      0      0      0      0      0      0
##  2 C5.0            0      0       0      0      0      0      0      0      0
##  3 ctree           0      0       0      0      0      0      0      0      0
##  4 fda             0      0       0      0      0      0      0      0      0
##  5 gbm             0      0       0      0      0      0      0      0      0
##  6 gcvEarth        0      0       0      0      0      0      0      0      0
##  7 JRip            0      0       0      0      0      0      0      0      0
##  8 knn             0      0       0      0      0      0      0      0      0
##  9 lvq             0      0       0      0      0      0      0      0      0
## 10 mlpML           0      0       0      0      0      0      0      0      0
## 11 multinom        0      0       0      0      0      0      0      0      0
## 12 naive_bayes     0      0       0      0      0      0      0      0      0
## 13 PART            0      0       0      0      0      0      0      0      0
## 14 rbfDDA          0      0       0      0      0      0      0      0      0
## 15 rda             0      0       0      0      0      0      0      0      0
## 16 rf              0      0       0      0      0      0      0      0      0
## 17 rfRules         0      0       0      0      0      0      0      0      0
## 18 rpart           0      0       0      0      0      0      0      0      0
## 19 simpls          0      0       0      0      0      0      0      0      0
## 20 svmRadial       0      0       0      0      0      0      0      0      0
## # ℹ 123 more variables: `0_80` <dbl>, `0_90` <dbl>, `10_0` <dbl>,
## #   `10_10` <dbl>, `10_100` <dbl>, `10_20` <dbl>, `10_30` <dbl>, `10_40` <dbl>,
## #   `10_50` <dbl>, `10_60` <dbl>, `10_70` <dbl>, `10_80` <dbl>, `10_90` <dbl>,
## #   `100_0` <dbl>, `100_10` <dbl>, `100_100` <dbl>, `100_20` <dbl>,
## #   `100_30` <dbl>, `100_40` <dbl>, `100_50` <dbl>, `100_60` <dbl>,
## #   `100_70` <dbl>, `100_80` <dbl>, `100_90` <dbl>, `20_0` <dbl>,
## #   `20_10` <dbl>, `20_100` <dbl>, `20_20` <dbl>, `20_30` <dbl>, …

Obtain distance matrix

Compute the distance matrix

Find the optimal number of clusters

Optimal clusters with Elbow

Optimal clusters with Silhouette

Omitted calculating PCA and NbClust given errors in Model_Heirarchy.Rmd

Perform Heriarchical Clustering

Optimal K determined as 2.

# Perform hierarchical clustering
hclusters <- hclust(distance_matrix, method = "ward.D")

# Cut the tree to get k=4 clusters
k <- 2
clusters <- cutree(hclusters, k = k)

# Print cluster assignments
print(clusters)
##  [1] 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 2 1
# First get unique techniques in the same order as used for clustering
techniques <- wide_data$technique

# Create the mapping dataframe
technique_clusters <- data.frame(
  technique = techniques,
  cluster = clusters
)

# Create a named vector to map colors to specific clusters
# This ensures consistent color usage across all plots
cluster_colors <- c(
  "1" = "#4FB28F",  # Green
  "2" = "#F65215"   # Orange
)

# Save the dendrogram with colored rectangles by cluster
png("../../results/plots/dendogram2.png", width = 4000, height = 3000, res = 600)
plot(hclusters, hang = -1, labels = wide_data$technique, 
     main = paste("Hierarchical Grouping (k =", k, ")"), 
     xlab = "Observations", sub = NULL)

# Create colored rectangles with consistent colors per cluster
rect.hclust(hclusters, k = k, border = cluster_colors[as.character(1:k)])
invisible(dev.off())

# Generate silhouette plot with consistent colors
png("../../results/plots/silhouette_t2.png", width = 4000, height = 3000, res = 600)
sil <- silhouette(clusters, dist = distance_matrix)

# Use the same colors for silhouette plot as for dendrogram
plot(sil, col = cluster_colors[as.character(sort(unique(clusters)))], 
     main = paste("Silhouette Plot (k =", k, ")"))
invisible(dev.off())

Plots

# Join cluster assignments with original data
meanKLC_with_clusters <- meanKLC %>%
  left_join(technique_clusters, by = "technique")

# Calculate mean kappa loss for each cluster, noise level, and percentage
cluster_means <- meanKLC_with_clusters %>%
  group_by(cluster, noise, percentage) %>%
  summarize(kappa_loss = round(mean(kappa_loss, na.rm = TRUE), 2), .groups = 'drop')

print(cluster_means)
## # A tibble: 264 × 4
##    cluster noise percentage kappa_loss
##      <int> <dbl>      <dbl>      <dbl>
##  1       1     0          0          0
##  2       1     0         10          0
##  3       1     0         20          0
##  4       1     0         30          0
##  5       1     0         40          0
##  6       1     0         50          0
##  7       1     0         60          0
##  8       1     0         70          0
##  9       1     0         80          0
## 10       1     0         90          0
## # ℹ 254 more rows
# Create plots for individual techniques (optional)
for(instance in instances_names) {
  # Filter data for the current instance percentage
  filtered_data <- subset(meanKLC_with_clusters, percentage == instance)
  
  # Create plot with consistent colors
  p1 <- ggplot(filtered_data, aes(x = noise, y = kappa_loss, color = factor(cluster))) +
    geom_point() +
    geom_line(aes(group = technique)) +
    # Use consistent colors based on cluster assignment
    scale_color_manual(values = cluster_colors) +
    labs(x = "Noise", y = "Kappa Loss", color = "Cluster") +
    ggtitle(paste0("Kappa Loss Curves by technique, noise and ", instance, " % of instances altered")) +
    theme_bw() +
    scale_y_continuous(limits = c(0.0, 0.5), breaks = seq(0, 1, by = 0.1))
  
  # Print plot
  print(p1)
}

# Create plots for cluster means
for(instance in instances_names) {
  # Filter data for the current instance percentage
  filtered_data <- subset(cluster_means, percentage == instance)
  
  # Create plot with consistent colors
  p2 <- ggplot(filtered_data, aes(x = noise, y = kappa_loss, color = factor(cluster))) +
    geom_point() +
    geom_line(aes(group = cluster)) +
    # Use consistent colors based on cluster assignment
    scale_color_manual(values = cluster_colors) +
    labs(x = "Noise", y = "Kappa Loss", color = "Cluster") +
    ggtitle(paste0("Kappa Loss Curves by cluster, noise and ", instance, " % of instances altered")) +
    theme_bw() +
    scale_y_continuous(limits = c(0.0, 0.5), breaks = seq(0, 1, by = 0.1))
  
  # Print plot
  print(p2)
}

# Create an empty list to store plots
plot_list <- list()

# Create all plots and store them in the list
for(i in seq_along(instances_names)) {
  instance <- instances_names[i]
  
  # Filter data for both techniques and clusters
  filtered_tech_data <- subset(meanKLC_with_clusters, percentage == instance)
  filtered_cluster_data <- subset(cluster_means, percentage == instance)
  
  # Create combined plot with consistent colors
  combined_plot <- ggplot() +
    # Add technique lines with colors based on their cluster
    geom_line(data = filtered_tech_data, 
              aes(x = noise, y = kappa_loss, group = technique, color = factor(cluster)),
              linetype = "solid", alpha = 0.5) +
    geom_point(data = filtered_tech_data,
               aes(x = noise, y = kappa_loss, group = technique, color = factor(cluster)),
               alpha = 0.5) +
    
    # Add thicker cluster lines to show the averages
    geom_line(data = filtered_cluster_data,
              aes(x = noise, y = kappa_loss, group = cluster, color = factor(cluster)),
              linewidth = 1.5) +
    geom_point(data = filtered_cluster_data,
               aes(x = noise, y = kappa_loss, group = cluster, color = factor(cluster)),
               size = 3) +
    
    # Set the specific color mapping - consistent with other plots
    scale_color_manual(name = "Cluster", values = cluster_colors) +
    
    # Customize the plot
    scale_y_continuous(limits = c(0.0, 0.5), breaks = seq(0, 1, by = 0.1)) +
    labs(x = "Noise",
         y = "Kappa Loss",
         title = paste0(instance, "% of instances altered")) +
    theme_bw() +
    theme(legend.position = "right")
  
  # Store plot in list
  plot_list[[i]] <- combined_plot
}

# Arrange all plots in a grid using patchwork
if (requireNamespace("patchwork", quietly = TRUE)) {
  # Using patchwork
  library(patchwork)
  combined_grid <- wrap_plots(plot_list, ncol = 1) + 
    plot_annotation(title = "Kappa Loss Curves by Technique and Cluster")
  print(combined_grid)
  
  # Save the grid plot
  png(filename = "../../results/plots/cluster_curves_grid2.png", 
      width = 4000, height = 12000, res = 600)
  print(combined_grid)
  dev.off()
} else {
  # Print plots individually if patchwork is not available
  for (p in plot_list) {
    print(p)
  }
}

## quartz_off_screen 
##                 2